In [425]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import (
RandomForestRegressor, GradientBoostingRegressor,
AdaBoostRegressor
)
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklearn.metrics import r2_score, mean_squared_error
# --- Load data ---
# Two raw CSV exports for the two targets, stacked into one frame.
file1 = '/path/to/Gly-data y1 copy.csv'
file2 = '/path/to/Gly-data y2.csv'
df1, df2 = (pd.read_csv(path) for path in (file1, file2))
data = pd.concat([df1, df2], ignore_index=True)
def clean_target(val):
    """Coerce a raw target-column value to a float.

    The placeholder marker ``'xx'`` (a censored/below-detection entry)
    is mapped to a small positive floor of 0.01; parsable values are
    converted with ``float``; anything else becomes ``np.nan`` so it
    can be dropped or imputed downstream.

    Parameters
    ----------
    val : object
        Raw cell value from the 'y1' or 'y2' column.

    Returns
    -------
    float
        0.01 for the placeholder, float(val) when parsable, np.nan otherwise.
    """
    # Original membership list repeated 'xx' three times; one check suffices.
    if str(val).strip() == 'xx':
        return 0.01
    try:
        return float(val)
    except (TypeError, ValueError):
        # A bare `except:` would also swallow KeyboardInterrupt/SystemExit.
        return np.nan
# Clean both targets, keep rows where at least one target is present,
# then floor any remaining missing target at 0.01.
for target in ('y1', 'y2'):
    data[target] = data[target].apply(clean_target)
data = data.dropna(subset=['y1', 'y2'], how='all')
data[['y1', 'y2']] = data[['y1', 'y2']].fillna(0.01)

# Feature matrix / first target.
X = data.drop(columns=['y1', 'y2'])
y1 = data['y1']

# Booleans become strings so the one-hot branch handles them.
bool_cols = X.select_dtypes(include='bool').columns.tolist()
X[bool_cols] = X[bool_cols].astype(str)

# Split columns by kind, then normalise each side:
# categoricals -> 'missing'-filled strings, numerics -> coerced floats
# with column-mean imputation.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X.select_dtypes(include=[np.number]).columns.tolist()
X[categorical_cols] = X[categorical_cols].fillna('missing').astype(str)
X[numerical_cols] = (
    X[numerical_cols]
    .apply(pd.to_numeric, errors='coerce')
)
X[numerical_cols] = X[numerical_cols].fillna(X[numerical_cols].mean())
# Categorical branch: constant-impute then one-hot encode (unknown
# categories at transform time are ignored, not errors).
cat_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
]
cat_pipeline = Pipeline(cat_steps)

# Numeric branch: mean-impute then standardize to zero mean / unit variance.
num_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(num_steps)

# Route each column list through its branch.
preprocessor = ColumnTransformer([
    ('cat', cat_pipeline, categorical_cols),
    ('num', num_pipeline, numerical_cols),
])
# Candidate regressors for y1, keyed by display name. Insertion order is
# meaningful: it fixes the ordering of the results table and plots.
models = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("SVR", SVR()),
    ("DecisionTree", DecisionTreeRegressor()),
    ("RandomForest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ("AdaBoost", AdaBoostRegressor(n_estimators=100, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=100, random_state=42, verbosity=0)),
    ("CatBoost", CatBoostRegressor(n_estimators=100, verbose=0, random_state=42)),
])
# Hold out 20% for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y1_train, y1_val = train_test_split(X, y1, test_size=0.2, random_state=42)

# Fit every candidate model on y1 and record train/validation metrics.
results = []
for name, model in models.items():
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y1_train)

    y1_train_pred = pipeline.predict(X_train)
    y1_val_pred = pipeline.predict(X_val)

    train_r2 = r2_score(y1_train, y1_train_pred)
    val_r2 = r2_score(y1_val, y1_val_pred)
    # RMSE via np.sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed
    # in 1.6, so the original call raises TypeError on current versions.
    train_rmse = np.sqrt(mean_squared_error(y1_train, y1_train_pred))
    val_rmse = np.sqrt(mean_squared_error(y1_val, y1_val_pred))

    results.append({
        'Model': name,
        'Train R²': train_r2,
        'Validation R²': val_r2,
        'Train RMSE': train_rmse,
        'Validation RMSE': val_rmse
    })
# Build the summary frame from the per-model records. This line was
# missing: `results_df` was referenced below without ever being defined,
# so a fresh Restart-&-Run-All raised NameError here.
results_df = pd.DataFrame(results)

# Grouped bars: train vs validation R² per model.
plt.figure(figsize=(12, 6))
sns.barplot(
    data=results_df.melt(id_vars='Model', value_vars=['Train R²', 'Validation R²']),
    x='Model', y='value', hue='variable'
)
plt.title('A-Ratio: R² Scores for Train and Validation')
plt.ylabel('R² Score')
plt.xticks(rotation=45)
plt.legend(title='Dataset', loc='lower right')
plt.tight_layout()
plt.show()
In [444]:
import matplotlib.pyplot as plt

# Two 5x2 panel grids: one figure for training fits, one for validation.
fig_train, train_axes = plt.subplots(5, 2, figsize=(11, 16), dpi=600)
fig_val, val_axes = plt.subplots(5, 2, figsize=(11, 16), dpi=600)
train_axes = train_axes.flatten()
val_axes = val_axes.flatten()

# Fresh estimator instances for the y2 target (same lineup as for y1).
models_y2 = dict([
    ("LinearRegression", LinearRegression()),
    ("Ridge", Ridge()),
    ("Lasso", Lasso()),
    ("SVR", SVR()),
    ("DecisionTree", DecisionTreeRegressor()),
    ("RandomForest", RandomForestRegressor(n_estimators=100, random_state=42)),
    ("GradientBoosting", GradientBoostingRegressor(n_estimators=100, random_state=42)),
    ("AdaBoost", AdaBoostRegressor(n_estimators=100, random_state=42)),
    ("XGBoost", XGBRegressor(n_estimators=100, random_state=42, verbosity=0)),
    ("CatBoost", CatBoostRegressor(n_estimators=100, verbose=0, random_state=42)),
])

# Same seed as the y1 experiment, so the X split is identical rows.
y2 = data['y2']
X_train, X_val, y2_train, y2_val = train_test_split(X, y2, test_size=0.2, random_state=42)
def _plot_panel(ax, y_true, y_pred, title, r2, rmse):
    """Scatter actual vs predicted on `ax` with a y=x reference line and
    an R²/RMSE annotation in the legend box."""
    ax.scatter(y_true, y_pred, alpha=0.6, edgecolors='k')
    lo, hi = y_true.min(), y_true.max()
    ax.plot([lo, hi], [lo, hi], 'r--')
    ax.set_title(title)
    ax.set_xlabel('True Yield')
    ax.set_ylabel('Predicted Yield')
    ax.legend([f"R² = {r2:.2f}\nRMSE = {rmse:.2f}"], loc='lower right')

for idx, (name, model) in enumerate(models_y2.items()):
    pipeline = Pipeline([
        ('preprocessing', preprocessor),
        ('regressor', model)
    ])
    pipeline.fit(X_train, y2_train)

    y2_train_pred = pipeline.predict(X_train)
    y2_val_pred = pipeline.predict(X_val)

    train_r2 = r2_score(y2_train, y2_train_pred)
    val_r2 = r2_score(y2_val, y2_val_pred)
    # RMSE via np.sqrt(MSE): `squared=False` was deprecated in
    # scikit-learn 1.4 and removed in 1.6, so the original call raises
    # TypeError on current versions.
    train_rmse = np.sqrt(mean_squared_error(y2_train, y2_train_pred))
    val_rmse = np.sqrt(mean_squared_error(y2_val, y2_val_pred))

    _plot_panel(train_axes[idx], y2_train, y2_train_pred,
                f"{name} (Train)", train_r2, train_rmse)
    _plot_panel(val_axes[idx], y2_val, y2_val_pred,
                f"{name} (Validation)", val_r2, val_rmse)

# Set the suptitles BEFORE tight_layout and reserve headroom via `rect`;
# the original called tight_layout first with y=1.02, which left the
# titles clipped off the top of the saved figures.
fig_train.suptitle("Training: Actual vs Predicted Yield (y2)", fontsize=18)
fig_val.suptitle("Validation: Actual vs Predicted Yield (y2)", fontsize=18)
fig_train.tight_layout(rect=(0, 0, 1, 0.97))
fig_val.tight_layout(rect=(0, 0, 1, 0.97))
plt.show()